Load packages
library(tidyverse)
library(tidytext)
library(igraph)
library(ggraph)
library(stringr)
library(widyr)
library(knitr)
library(topicmodels)
Get data.
# Read the assembled corpus; records with no title are unusable downstream.
dat <- read_csv("data/final_corpus.csv")
dat <- filter(dat, !is.na(title))
Make a data frame that lists all the authors in separate columns for each paper.
# One row per (paper id, author): split the semicolon-delimited author field.
author_df <- dat %>%
  select(id, authors) %>%
  unnest_tokens(
    output = authors_long,
    input = authors,
    token = stringr::str_split,
    pattern = ";"
  )
Convert each author name to “last, first initial” format.
# Normalize each author name to "f. last" format.
# Two input shapes are handled: "last, first ..." (comma present) and
# "first last" (no comma).
author_df <- author_df %>%
# Strip whitespace left over from splitting on ";".
mutate(authors_long = str_trim(authors_long)) %>%
# Last name: everything before the comma, or the final space-delimited token.
mutate(last = ifelse(grepl(",", authors_long) == TRUE,
str_extract(authors_long, "[^,]*"),
str_extract(authors_long, "[^ ]*$"))) %>%
# First initial, comma case: strsplit() yields a list column whose second
# element is the first name; non-comma case: just the first character.
mutate(first_init = ifelse(grepl(",", authors_long) == TRUE,
strsplit(authors_long, " "),
str_sub(authors_long, start = 1, end = 1))) %>%
# Expand the list column, then keep the second token per (author, paper).
unnest(first_init) %>%
group_by(authors_long, id) %>%
# NOTE(review): slice(2) returns zero rows for groups with a single token,
# so any "first last" (no-comma) author would be dropped entirely here --
# confirm the corpus only contains comma-formatted names.
slice(2) %>%
ungroup() %>%
arrange(id) %>%
# Reduce the first-name token to its initial and assemble "f. last".
mutate(first_init = str_sub(first_init, 1, 1)) %>%
mutate(author = paste0(first_init, ". ", last)) %>%
select(-last, -first_init)
kable(head(author_df))
| id | authors_long | author |
|---|---|---|
| 17 | cannon, ellie | e. cannon |
| 17 | copenhaver-parry, paige e. | p. copenhaver-parry |
| 18 | betts, matthew g. | m. betts |
| 18 | frey, sarah j. k. | s. frey |
| 18 | hadley, adam s. | a. hadley |
| 20 | barnhart, theodore b. | t. barnhart |
write_csv(author_df, "data/authors.csv")
Now column “author” contains the most standard version. It looks like we have 1966 unique authors.
Get pairwise author count:
# Number of papers each pair of authors shares (undirected, no duplicates).
author_pairs <- author_df %>%
  pairwise_count(author, id, sort = TRUE, upper = FALSE)
colnames(author_pairs)[1:2] <- c("author1", "author2")
kable(head(author_pairs))
| author1 | author2 | n |
|---|---|---|
| w. romme | m. turner | 9 |
| j. bradford | w. lauenroth | 6 |
| a. hamlet | d. lettenmaier | 6 |
| l. leung | y. qian | 6 |
| d. horan | d. isaak | 5 |
| d. isaak | d. nagel | 5 |
Make a yearly version of author pairs as well:
# Attach publication year to each author row; drop papers with no year.
author_df <- full_join(author_df, dat[, c("id", "year")], by = "id")
author_df <- author_df %>% filter(!is.na(year))

# Build one pairwise-count table per year, then stack them.
years <- unique(author_df$year)
pair_dfs <- vector("list", length(years))
for (i in seq_along(years)) {  # seq_along() is safe even if years is empty
  df <- author_df %>% filter(year == years[i])
  pair_dfs[[i]] <- df %>%
    pairwise_count(author, id, sort = TRUE, upper = FALSE) %>%
    mutate(year = years[i])
}
author_pairs_years <- bind_rows(pair_dfs)
names(author_pairs_years)[1:2] <- c("author1", "author2")
# Subset to years with > 50 author collaborations in our dataset
# (save a copy with the full dataset for later).
apy_full <- author_pairs_years
# count() names its column "nn" here because the table already has an "n";
# use a new name rather than overwriting the `years` vector with a data frame.
busy_years <- author_pairs_years %>% group_by(year) %>% count() %>% filter(nn > 50)
author_pairs_years <- author_pairs_years %>% filter(year %in% busy_years$year)
Join authors with subject data (using journal, discipline, keyword?). A possible approach here is to do some topic modeling with the keywords (276 papers are missing keyword data), categorize papers by topic, and then do network analysis grouped by topic. This is probably a good idea anyway because we want to split papers by topic for when we code for content.
# Long format: one row per (paper, keyword). Commas are normalized to
# semicolons first so both delimiters split identically.
keyword_df <- dat %>%
  dplyr::select(id, keywords) %>%
  mutate(keywords = gsub(",", ";", keywords)) %>%
  unnest_tokens(
    input = keywords, output = keywords,
    token = stringr::str_split, pattern = ";"
  )
keyword_df <- keyword_df %>%
  filter(!is.na(keywords)) %>%
  mutate(keywords = str_trim(keywords))
We’ve got 2136 unique keywords. Let’s look at keyword pairs to see how they’re grouped.
# Keyword co-occurrence counts across papers.
keyword_pairs <- keyword_df %>%
  pairwise_count(keywords, id, sort = TRUE, upper = FALSE)

# Network of keywords that co-occur on at least 10 papers; the seed fixes
# the force-directed ("fr") layout.
set.seed(1234)
keyword_graph <- keyword_pairs %>%
  filter(n >= 10) %>%
  graph_from_data_frame()
ggraph(keyword_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "salmon") +
  geom_node_point(size = 2) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()
Wow, that keyword network seems to be very centralized. And I worry a little bit that the structure we see here is just an artifact of how people happen to decide on topics.
# Pairwise correlation between keywords across papers.
keyword_cors <- keyword_df %>%
  group_by(keywords) %>%
  # filter(n() >= 50) %>%
  pairwise_cor(keywords, id, sort = TRUE, upper = FALSE)
# Take out keyword correlations with correlation 1; these are redundant.
keyword_cors <- filter(keyword_cors, round(correlation, 3) < 1)
kable(head(keyword_cors))
| item1 | item2 | correlation |
|---|---|---|
| plant pathogenic fungi | plant pathogens | 0.9347054 |
| fungal diseases | plant pathogenic fungi | 0.9251195 |
| plant pests | insect pests | 0.9135562 |
| aquatic plants | phytoplankton | 0.9121812 |
| air pollutants | air pollution | 0.9121812 |
| arthropod pests | pests | 0.9031583 |
Visualize the network of keyword correlations:
# Network of strongly correlated keywords (> 0.7); seed fixes the layout.
set.seed(1234)
cor_graph <- keyword_cors %>%
  filter(correlation > .7) %>%
  graph_from_data_frame()
ggraph(cor_graph, layout = "kk") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation),
                 edge_colour = "darkorchid") +
  geom_node_point(size = 1) +
  # Labels disabled: too dense to read at this scale.
  # geom_node_text(aes(label = name), repel = FALSE,
  #                point.padding = unit(0.2, "lines"),
  #                size = 2) +
  theme_void()
Well, that’s extremely difficult to read, but it does look like there’s some more meaningful grouping with correlations than there was with just pairwise analysis.
Let’s move on to some topic modeling to see if we can group papers. First, define stop words in addition to the common ones, and get word counts.
# Custom stop words: terms that appear on nearly every paper in this corpus
# and carry no signal. tibble() replaces the deprecated data_frame().
my_stop_words <- tibble(word = c("climate change", "usa"),
                        lexicon = rep("custom", 2))
# Per-paper keyword counts with the custom stop words removed;
# anti_join() joins by "word" (the echoed message is expected).
word_counts <- keyword_df %>%
  rename(word = keywords) %>%
  anti_join(my_stop_words) %>%
  count(id, word, sort = TRUE) %>%
  ungroup() %>%
  arrange(-n)
## Joining, by = "word"
word_counts
## # A tibble: 6,461 x 3
## id word n
## <int> <chr> <int>
## 1 1194 dissolved organic 2
## 2 17 bayesian model 1
## 3 17 distribution edge 1
## 4 17 distribution shift 1
## 5 17 plant performance 1
## 6 17 sensitivity analysis 1
## 7 18 composition 1
## 8 18 dynamic occupancy models 1
## 9 18 forest bird distributions 1
## 10 18 forest structure and 1
## # ... with 6,451 more rows
This topic modeling approach may not make a lot of sense since keywords generally only appear once… but continue anyway.
# Cast keyword counts to a document-term matrix and fit an 8-topic LDA.
keyword_dtm <- cast_dtm(word_counts, id, word, n)
keyword_lda <- LDA(keyword_dtm, k = 8, control = list(seed = 1234))

# Per-topic word probabilities (beta); keep the 10 strongest terms per topic.
tidy_lda <- tidy(keyword_lda)
top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
# Bar chart of top terms per topic. The "term__topic" suffix makes factor
# levels unique per facet so each facet sorts independently; the axis
# labeller strips the suffix back off for display.
plot_df <- top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  group_by(topic, term) %>%
  arrange(desc(beta)) %>%
  ungroup() %>%
  mutate(term = factor(paste(term, topic, sep = "__"),
                       levels = rev(paste(term, topic, sep = "__"))))
ggplot(plot_df, aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_discrete(labels = function(lbl) gsub("__.+$", "", lbl)) +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL, y = expression(beta)) +
  facet_wrap(~ topic, ncol = 4, scales = "free")
These look like they’re somewhat informative, but imperfect. They would probably be more informative if we did this same procedure with full texts or abstracts (which might be a good next step, using crminer).
For now, can we categorize each paper by its topic?
# Per-document topic probabilities; label each paper with its highest-gamma
# topic (exact ties would keep multiple rows for a paper).
lda_gamma <- tidy(keyword_lda, matrix = "gamma")
id_topic <- lda_gamma %>%
  group_by(document) %>%
  filter(gamma == max(gamma)) %>%
  ungroup() %>%
  transmute(id = as.integer(document), topic = topic)
# Statistical mode of x: the most frequent value. Ties are broken in favor
# of the value that appears first in x.
Mode <- function(x) {
  vals <- unique(x)
  counts <- tabulate(match(x, vals))
  vals[which.max(counts)]
}
# Attach each paper's topic to its authors, then give every author the topic
# they publish in most often (Mode breaks ties by first occurrence).
network_df <- left_join(author_df, id_topic, by = "id")
author_topics <- summarise(group_by(network_df, author),
                           topic_new = Mode(topic))
kable(head(author_topics))
| author | topic_new |
|---|---|
| a. adams | 5 |
| a. adeloye | 1 |
| a. ager | 5 |
| a. aldous | 6 |
| a. andrade | 5 |
| a. armstrong | 7 |
#author_pairs <- author_df %>%
# pairwise_count(author, id, sort = TRUE, upper = FALSE)
#names(author_pairs)[1:2] <- c("author1", "author2")
#kable(head(author_pairs))
# Tokenize abstracts into one word per row and drop standard stop words;
# anti_join() joins by "word" (the echoed message is expected).
abstract_df <- dat %>%
  select(id, abstract) %>%
  filter(!is.na(abstract)) %>%
  unnest_tokens(word, abstract) %>%
  anti_join(stop_words)
## Joining, by = "word"
count(abstract_df, word, sort = TRUE)
## # A tibble: 9,257 x 2
## word n
## <chr> <int>
## 1 climate 2000
## 2 change 1067
## 3 species 772
## 4 temperature 720
## 5 precipitation 646
## 6 model 597
## 7 water 578
## 8 forest 544
## 9 fire 498
## 10 1999 486
## # ... with 9,247 more rows
#Looks like there are some numbers... maybe get rid of those.
#Also, change everything to lower case.
# Clean the tokens: drop anything containing a digit, force lower case,
# and remove a few HTML/XML artifacts that survived tokenization.
# NOTE(review): unnest_tokens lowercases by default, so tolower() here is
# presumably redundant -- confirm before removing.
abstract_df <- abstract_df %>%
  filter(!grepl("[[:digit:]]", word)) %>%
  mutate(word = tolower(word)) %>%
  filter(!word %in% c("org", "http", "xhtml", "xmlns"))
library(widyr)
# Word co-occurrence counts across abstracts.
abstract_word_pairs <- abstract_df %>%
  pairwise_count(word, id, sort = TRUE, upper = FALSE)

# This is slow - only run if needed.
# abstract_cors <- abstract_df %>%
#   group_by(word) %>%
#   # filter(n() >= 50) %>%
#   pairwise_cor(word, id, sort = TRUE, upper = FALSE)

# Per-document word counts, then tf-idf weights on top of them.
word_counts <- abstract_df %>%
  count(id, word, sort = TRUE) %>%
  ungroup()
abstract_tf_idf <- bind_tf_idf(word_counts, word, id, n)
word_counts
## # A tibble: 79,097 x 3
## id word n
## <int> <chr> <int>
## 1 2375 glacier 22
## 2 2444 glacier 22
## 3 615 degrees 19
## 4 2475 aspen 18
## 5 2875 species 18
## 6 3078 water 18
## 7 99 recharge 17
## 8 194 species 17
## 9 700 stream 17
## 10 821 snow 17
## # ... with 79,087 more rows
# Cast abstract word counts to a document-term matrix and fit an 8-topic
# LDA model. This may take a while.
abstract_dtm <- cast_dtm(word_counts, id, word, n)
abstract_lda <- LDA(abstract_dtm, k = 8, control = list(seed = 1234))
# Per-topic word probabilities; keep the 10 strongest terms in each topic.
tidy_lda <- tidy(abstract_lda)
top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
top_terms
## # A tibble: 80 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 fire 0.033224806
## 2 1 forest 0.017407021
## 3 1 carbon 0.011841713
## 4 1 forests 0.010755105
## 5 1 soil 0.009741968
## 6 1 severity 0.008870568
## 7 1 burned 0.008357221
## 8 1 disturbance 0.007976146
## 9 1 wildfire 0.007410203
## 10 1 fires 0.007398807
## # ... with 70 more rows
# Bar chart of top terms per topic (same "term__topic" suffix trick as the
# keyword plot: unique levels per facet, suffix stripped by the labeller).
abs_plot_df <- top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  group_by(topic, term) %>%
  arrange(desc(beta)) %>%
  ungroup() %>%
  mutate(term = factor(paste(term, topic, sep = "__"),
                       levels = rev(paste(term, topic, sep = "__"))))
ggplot(abs_plot_df, aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_discrete(labels = function(lbl) gsub("__.+$", "", lbl)) +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL, y = expression(beta)) +
  facet_wrap(~ topic, ncol = 4, scales = "free")
# Distribution of document-topic probabilities. Well-separated topics would
# pile documents near gamma = 0 and gamma = 1 in each facet.
lda_gamma <- tidy(abstract_lda, matrix = "gamma")
gamma_plot <- ggplot(lda_gamma, aes(gamma, fill = as.factor(topic))) +
  geom_histogram(show.legend = FALSE, bins = 40) +
  facet_wrap(~ topic, ncol = 4) +
  scale_y_log10() +
  labs(title = "Distribution of probability for each topic",
       y = "Number of documents", x = expression(gamma))
gamma_plot
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 7 rows containing missing values (geom_bar).
It doesn’t look like these are very well separated: in an ideal world where each document fits perfectly into one category, we would just have bars at the far left and right sides of each facet.
Repeat the topic-modeling with term frequency-inverse document frequency (TF-IDF) instead of word counts:
## # A tibble: 80 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 fire 0.011607530
## 2 1 forest 0.005911603
## 3 1 stand 0.005449802
## 4 1 severity 0.005425651
## 5 1 burned 0.005305420
## 6 1 fires 0.005079615
## 7 1 aspen 0.004897655
## 8 1 carbon 0.004886524
## 9 1 beetle 0.004877169
## 10 1 wildfire 0.004625715
## # ... with 70 more rows
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 11 rows containing missing values (geom_bar).
I’ve played around a little with different numbers of topics, and I don’t think the gammas are a lot better with fewer than more topics - although, selecting only four topics (for example) makes the topics a lot more intelligible to me.
Take a look at how the most common terms in abstracts have changed over time.
#Add year to abstract word counts, and count number of times each word occurs in each year.
# Add year to abstract word counts, and count occurrences per (year, word).
# BUG FIX: author_df has one row per author per paper, so joining it directly
# duplicated every abstract word once per coauthor and inflated the yearly
# counts. Join against the distinct id/year pairs instead.
id_years <- distinct(author_df[, c("id", "year")])
df <- full_join(abstract_df, id_years, by = "id") %>%
  group_by(year, word) %>%
  count() %>%
  arrange(year, desc(n))

# Figure out most common terms and see how they've changed over time
# (could also choose terms of interest in another way):
top_words <- df %>%
  group_by(word) %>%
  summarise(total = sum(n)) %>%
  arrange(desc(total))
top <- top_words[1:10, ]

# Trim the sparse early years and the (likely incomplete) final year.
plot_dat <- df %>%
  filter(word %in% top$word) %>%
  filter(year < 2017) %>%
  filter(year > 1980)
ggplot(plot_dat, aes(x = year, y = n, group = word, color = word)) +
  geom_line()
If we’re interested in doing more with this, two good next steps would be to normalize the data by total number of words in the corpus for each year (the corpus gets bigger over time, so words naturally become more frequent) and to think more about what words we’re actually interested in. It might be that combinations of words are really the most interesting (bark beetle? forest carbon? stream temperature?).
Another thing we can do is look at which keywords or abstract words act as bridging concepts by treating the keywords or terms in abstracts as nodes in a network, where the edges are papers. Let’s start with keywords for simplicity. Earlier, we already made a network graph of keywords. Another way of assessing this is by testing the centrality metrics for different keywords.
# Centrality metrics for the keyword co-occurrence network.
graph_obj <- graph_from_data_frame(keyword_pairs, directed = FALSE)

# eigen_centrality() returns a list; [[1]] extracts the $vector element.
ec <- eigen_centrality(graph_obj)[[1]]
ec_df <- data.frame(ec)
ec_df <- ec_df %>% mutate(name = rownames(ec_df)) %>%
  arrange(desc(ec))
# BUG FIX: `names(ec_df[1]) <- ...` renamed a temporary copy and was silently
# discarded (the rendered table still showed "ec"). Assign into names(ec_df).
names(ec_df)[1] <- "eigen_centrality"

bet <- betweenness(graph_obj) %>%
  data.frame()
bet <- bet %>% mutate(name = rownames(bet))
names(bet)[1] <- "betweenness"

deg <- degree(graph_obj) %>%
  data.frame()
deg <- deg %>% mutate(name = rownames(deg))
names(deg)[1] <- "degree_centrality"

# Combine the three metrics into one table, keyed by keyword name.
keyword_stats <- full_join(ec_df, bet, by = "name") %>%
  full_join(deg, by = "name") %>%
  select(name, everything()) %>%
  arrange(desc(degree_centrality))

# Add the frequency of each keyword for reference.
freq <- keyword_df %>%
  group_by(keywords) %>%
  count() %>%
  rename("name" = "keywords")
keyword_stats <- left_join(keyword_stats, freq, by = "name")
kable(keyword_stats[1:20,])
| name | ec | betweenness | degree_centrality | n |
|---|---|---|---|---|
| climate change | 1.0000000 | 1208146.119 | 1422 | 444 |
| climate | 0.8519273 | 128232.473 | 700 | 176 |
| forests | 0.7401325 | 39942.966 | 528 | 96 |
| temperature | 0.6805871 | 40075.872 | 466 | 88 |
| global warming | 0.5490619 | 63682.013 | 393 | 59 |
| models | 0.6147894 | 14087.519 | 364 | 73 |
| ecology | 0.5943023 | 11313.304 | 358 | 50 |
| ecosystems | 0.5983748 | 10623.358 | 351 | 47 |
| effects | 0.5861596 | 9475.889 | 336 | 38 |
| precipitation | 0.5288523 | 35094.003 | 336 | 65 |
| pines | 0.5275373 | 12387.995 | 322 | 52 |
| snow | 0.4901485 | 23360.509 | 293 | 48 |
| hydrology | 0.4823323 | 19940.503 | 290 | 53 |
| drought | 0.4627442 | 19488.918 | 290 | 41 |
| woody plants | 0.4961344 | 7955.838 | 286 | 32 |
| national parks | 0.4822818 | 14362.226 | 271 | 33 |
| wildfires | 0.4990651 | 6896.379 | 264 | 34 |
| mountain areas | 0.4731524 | 8254.384 | 261 | 40 |
| watersheds | 0.4673826 | 10536.409 | 257 | 41 |
| mountains | 0.4698888 | 11721.581 | 255 | 35 |
The highest keywords in this list should be those that are the best connected to other keywords - that act as bridging concepts. However, they may also just be the most common ones. Another way to think about this is to see which keywords rank the lowest - which are not bridging concepts?
kable(tail(keyword_stats, 20))
| | name | ec | betweenness | degree_centrality | n |
|---|---|---|---|---|---|
| 2116 | paleoclimatology | 0.0021559 | 0 | 2 | 1 |
| 2117 | alberta | 0.0015828 | 0 | 2 | 1 |
| 2118 | national forest | 0.0015050 | 0 | 2 | 1 |
| 2119 | glacial geomorphology | 0.0009659 | 0 | 2 | 1 |
| 2120 | landscape evolution | 0.0009659 | 0 | 2 | 1 |
| 2121 | monitoring trends in burn severity | 0.0007862 | 0 | 2 | 1 |
| 2122 | forest disturbances | 0.0007862 | 0 | 2 | 1 |
| 2123 | timing of spring | 0.0005932 | 0 | 2 | 1 |
| 2124 | alpine treeline ecotone | 0.0000000 | 0 | 2 | 1 |
| 2125 | abiotic treeline controls | 0.0000000 | 0 | 2 | 1 |
| 2126 | dynamic response | 0.0000000 | 0 | 2 | 1 |
| 2127 | precipitation timing | 0.0000000 | 0 | 2 | 1 |
| 2128 | bacterial and fungal soil community composition | 0.0000000 | 0 | 2 | 1 |
| 2129 | environmental and vegetation effects | 0.0000000 | 0 | 2 | 1 |
| 2130 | scale dependence | 0.0000000 | 0 | 2 | 1 |
| 2131 | soil and atmospheric drought | 0.0000000 | 0 | 2 | 1 |
| 2132 | forest and meadow soils | 0.0000000 | 0 | 2 | 1 |
| 2133 | orographic precipitation | 0.0086511 | 0 | 1 | 1 |
| 2134 | climax communities | 0.0014596 | 0 | 1 | 1 |
| 2135 | western forest insects | 0.0001061 | 0 | 1 | 1 |
Maybe there’s a way to control for the frequency of keywords, if that’s something we’re concerned about.
Do we find the same patterns with abstract words as with keywords?
# Same centrality analysis, but on the abstract-word co-occurrence network.
graph_obj <- graph_from_data_frame(abstract_word_pairs, directed = FALSE)

# eigen_centrality() returns a list; [[1]] extracts the $vector element.
ec <- eigen_centrality(graph_obj)[[1]]
ec_df <- data.frame(ec)
ec_df <- ec_df %>% mutate(name = rownames(ec_df)) %>%
  arrange(desc(ec))
# BUG FIX: `names(ec_df[1]) <- ...` renamed a temporary copy and was silently
# discarded (the rendered table still showed "ec"). Assign into names(ec_df).
names(ec_df)[1] <- "eigen_centrality"

bet <- betweenness(graph_obj) %>%
  data.frame()
bet <- bet %>% mutate(name = rownames(bet))
names(bet)[1] <- "betweenness"

deg <- degree(graph_obj) %>%
  data.frame()
deg <- deg %>% mutate(name = rownames(deg))
names(deg)[1] <- "degree_centrality"

abstract_stats <- full_join(ec_df, bet, by = "name") %>%
  full_join(deg, by = "name") %>%
  select(name, everything()) %>%
  arrange(desc(degree_centrality))

# Add the frequency of each abstract word for reference.
freq <- abstract_df %>%
  group_by(word) %>%
  count() %>%
  rename("name" = "word")
abstract_stats <- left_join(abstract_stats, freq, by = "name")
kable(abstract_stats[1:20,])
| name | ec | betweenness | degree_centrality | n |
|---|---|---|---|---|
| climate | 1.0000000 | 1352220.8 | 7542 | 2000 |
| change | 0.9757264 | 1006650.8 | 6955 | 1067 |
| results | 0.9038545 | 429570.1 | 5480 | 408 |
| data | 0.8784773 | 360950.4 | 5142 | 439 |
| study | 0.8758785 | 356945.9 | 5120 | 404 |
| mountain | 0.8678757 | 358297.8 | 5029 | 415 |
| temperature | 0.8613191 | 307638.5 | 4927 | 720 |
| western | 0.8583111 | 306690.7 | 4846 | 437 |
| model | 0.8476313 | 301725.0 | 4809 | 597 |
| species | 0.8463399 | 288901.3 | 4808 | 772 |
| based | 0.8450738 | 267328.7 | 4672 | 307 |
| future | 0.8461055 | 247893.2 | 4638 | 437 |
| conditions | 0.8370237 | 243106.6 | 4548 | 326 |
| effects | 0.8384379 | 222812.7 | 4510 | 359 |
| water | 0.8230651 | 239067.1 | 4497 | 578 |
| potential | 0.8292180 | 218751.1 | 4437 | 269 |
| range | 0.8253281 | 225974.9 | 4417 | 342 |
| precipitation | 0.8122574 | 199892.5 | 4302 | 646 |
| forest | 0.8149408 | 201270.7 | 4293 | 544 |
| models | 0.8086865 | 187872.5 | 4218 | 336 |
What are the least well-connected words in abstracts?
kable(tail(abstract_stats, 20))
| | name | ec | betweenness | degree_centrality | n |
|---|---|---|---|---|---|
| 8242 | deliveries | 0.0180357 | 0 | 46 | 1 |
| 8243 | modsim | 0.0180357 | 0 | 46 | 1 |
| 8244 | incapable | 0.0180357 | 0 | 46 | 1 |
| 8245 | arranged | 0.0229174 | 0 | 45 | 1 |
| 8246 | arguing | 0.0177760 | 0 | 45 | 1 |
| 8247 | surrounded | 0.0177760 | 0 | 45 | 1 |
| 8248 | annular | 0.0183722 | 0 | 41 | 1 |
| 8249 | unseasonably | 0.0183722 | 0 | 41 | 1 |
| 8250 | epitomize | 0.0198688 | 0 | 39 | 1 |
| 8251 | nw | 0.0137711 | 0 | 39 | 1 |
| 8252 | behave | 0.0137711 | 0 | 39 | 1 |
| 8253 | rclimdex | 0.0180382 | 0 | 36 | 1 |
| 8254 | evidencing | 0.0180382 | 0 | 36 | 1 |
| 8255 | monatan | 0.0176755 | 0 | 34 | 1 |
| 8256 | entirety | 0.0176755 | 0 | 34 | 1 |
| 8257 | tenures | 0.0152463 | 0 | 33 | 2 |
| 8258 | comment | 0.0117125 | 0 | 31 | 1 |
| 8259 | papers | 0.0117125 | 0 | 31 | 1 |
| 8260 | editor | 0.0117125 | 0 | 31 | 1 |
| 8261 | facets | 0.0117125 | 0 | 31 | 1 |
Plot a network of abstract words: these are words that appear together 70 or more times (an arbitrary cutoff).
# Network of abstract words that co-occur on 70+ papers; seed fixes layout.
set.seed(1234)
frequent_pairs <- filter(abstract_word_pairs, n >= 70)
graph_from_data_frame(frequent_pairs) %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n),
                 edge_colour = "springgreen4") +
  geom_node_point(size = 2) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()